#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
library(biomaRt)
library(anRichment) ; library(BrainDiseaseCollection)
suppressWarnings(suppressMessages(library(WGCNA)))

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in 20_02_28_data_preprocessing.Rmd) and clustering (pipeline in 20_02_28_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))

# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)

genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))


clustering_selected = 'DynamicHybridMergedSmall'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds)


Relation to external clinical traits


Quantifying module-trait associations


Using the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.

datTraits = datMeta %>% dplyr::select(Diagnosis, Sex, Age, PMI, RNAExtractionBatch) %>%
            dplyr::rename('ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}
## [1] "2 correlation(s) could not be calculated"
rm(ME_object)

Note: The correlations between a Modules and Diagonsis that cannot be calculated, weirdly enough, is because the initial correlation is too high, so it would be a very bad thing to lose these modules because of this numerical error. I’m going to fill in the values using the polyserial function, which doesn’t give exactly the same results as the hetcor() function, but it’s quite similar.

# Calculate the correlation tha failed with hetcor()
missing_modules = rownames(moduleTraitCor)[is.na(moduleTraitCor[,1])]

for(m in missing_modules){
  cat(paste0('Correcting Module-Diagnosis correlation for Module ', m))
  moduleTraitCor[m,'Diagnosis'] = polyserial(MEs[,m], datTraits$Diagnosis)  
}
## Correcting Module-Diagnosis correlation for Module ME#D69100
## Warning in polyserial(MEs[, m], datTraits$Diagnosis): initial correlation
## inadmissible, 1.05417437498074, set to 0.9999
rm(missing_modules)

I’m going to select all the modules that have an absolute correlation higher than 0.9 with Diagnosis to study them

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
                           'MTcor' = moduleTraitCor[,'Diagnosis'],
                           'MTpval' = moduleTraitPvalue[,'Diagnosis'])

genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')

rm(moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)


Studying the modules with the highest absolute correlation to Diagnosis


The modules consist mainly of points with very high (absolute) values in PC2 (which we know is related to lfc), so this result is consistent with the high correlation between Module and Diagnosis, although some of the points with the highest PC2 values do not belong to these top modules.

The second module is quite small.

top_modules = gsub('ME','',rownames(moduleTraitCor)[abs(moduleTraitCor[,'Diagnosis'])>0.9])

cat(paste0('Top modules selected: ', paste(top_modules, collapse=', '),'\n'))
## Top modules selected: #D69100, #D873FC
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
            dplyr::select(ID, external_gene_id, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', external_gene_id, ')'))

table(plot_data$ImportantModules)
## 
## #D69100 #D873FC  Others 
##     891      23   15239
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))
rm(pca)




Module Membership vs Gene Significance


create_plot = function(module){
  
  plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
  colnames(plot_data)[2] = 'Module'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
  
  p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) + ylab('Gene Significance') +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8))) + theme_minimal() + xlab('Module Membership') +
               ggtitle(paste0('Module ', module,'  (MTcor = ', round(moduleTraitCor[paste0('ME',module),1],2),')')))
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
rm(create_plot)




SFARI Genes


List of top SFARI Genes in top modules ordered by SFARI score and Gene Significance

table_data = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
             dplyr::select(ID, external_gene_id, GS, gene.score, Module) %>% arrange(gene.score, desc(abs(GS))) %>%
             dplyr::rename('Ensembl ID'=ID, 'Gene Symbol'=external_gene_id, 
                    'SFARI score'=gene.score, 'Gene Significance'=GS)

kable(table_data %>% filter(Module == top_modules[1] & `SFARI score` %in% c(1,2,3)) %>% dplyr::select(-Module),
      caption=paste0('Top SFARI Genes for Module ', top_modules[1]))
Top SFARI Genes for Module #D69100
Ensembl ID Gene Symbol Gene Significance SFARI score
ENSG00000153827 TRIP12 0.6566301 1
ENSG00000164134 NAA15 0.6471031 1
ENSG00000110066 SUV420H1 NA 1
ENSG00000108510 MED13 0.8332876 2
ENSG00000114861 FOXP1 0.7136441 2
ENSG00000143621 ILF2 0.6583689 2
ENSG00000095787 WAC 0.0676702 2
ENSG00000077063 CTTNBP2 0.8704863 3
ENSG00000196628 TCF4 0.8588041 3
ENSG00000146247 PHIP 0.8284574 3
ENSG00000151693 ASAP2 0.8256907 3
ENSG00000109911 ELP4 0.8082447 3
ENSG00000135387 CAPRIN1 0.7634933 3
ENSG00000095564 BTAF1 0.7496291 3
ENSG00000008083 JARID2 0.7452185 3
ENSG00000066279 ASPM 0.6645671 3
ENSG00000183454 GRIN2A 0.6178399 3
ENSG00000112902 SEMA5A 0.5850774 3
ENSG00000112655 PTK7 0.5145820 3
ENSG00000089006 SNX5 0.4367331 3
ENSG00000171723 GPHN 0.3586154 3
ENSG00000205581 HMGN1 0.3562783 3
kable(table_data %>% filter(Module == top_modules[2] & `SFARI score` %in% c(1,2,3)) %>% dplyr::select(-Module),
      caption=paste0('Top SFARI Genes for Module ', top_modules[2]))
Top SFARI Genes for Module #D873FC
Ensembl ID Gene Symbol Gene Significance SFARI score

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but this doesn’t seem to be the case

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) + geom_hline(yintercept=mean(plot_data$p), color='gray') +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row, plot_data)

Breaking the SFARI genes by score

scores = c(1,2,3,4,5,6,'None')

plot_data = dataset %>% group_by(Module, MTcor, gene.score) %>% summarise(n=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(N=n()), by='Module') %>% 
            mutate(p=round(n/N*100,2), gene.score = as.character(gene.score))

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(sum(plot_data$Module == this_row$Module)<7){
    missing_scores = which(! scores %in% plot_data$gene.score[plot_data$Module == this_row$Module])
    for(s in missing_scores){
      new_row = this_row
      new_row$gene.score = as.character(s)
      new_row$n = 0
      new_row$p = 0
      plot_data = plot_data %>% rbind(new_row) 
    }
  }
}

plot_data = plot_data %>% filter(gene.score != 'None')

plot_function = function(i){
  i = 2*i-1
  plot_list = list()
  for(j in 1:2){
    plot_list[[j]] = ggplotly(plot_data %>% filter(gene.score==scores[i+j-1]) %>% ggplot(aes(MTcor, p, size=n)) + 
                geom_smooth(color='gray', se=FALSE) +
                geom_point(color=plot_data$Module[plot_data$gene.score==scores[i+j-1]], alpha=0.5, aes(id=Module)) +
                geom_hline(yintercept=mean(plot_data$p[plot_data$gene.score==scores[i+j-1]]), color='gray') +
                xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
                theme_minimal() + theme(legend.position = 'none'))
  }
  p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
    list(x = 0.2 , y = 1.05, text = paste0('SFARI score ', scores[i]), showarrow = F, xref='paper', yref='paper'),
    list(x = 0.8 , y = 1.05, text = paste0('SFARI score ', scores[i+1]), showarrow = F, xref='paper', yref='paper')))
  
  return(p)
}

plot_function(1)
plot_function(2)
plot_function(3)
rm(i, s, this_row, new_row, plot_function)




Module Eigengenes


Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control.

In both cases, the Eigengenes separate the behaviour between autism and control samples very clearly!

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME',module)], 'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + geom_boxplot() + theme_minimal() + theme(legend.position='none') +
                    ggtitle(paste0('Module ', module, '  (MTcor=',round(moduleTraitCor[paste0('ME',module),1],2),')'))
  return(p)
}

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])

grid.arrange(p1, p2, nrow=1)

rm(plot_EGs, p1, p2)




Identifying important genes


Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t any SFARI genes in the top genes of each module

create_table = function(module){
  top_genes = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>% 
              dplyr::select(ID, external_gene_id, paste0('MM.',gsub('#','',module)), GS, gene.score) %>%
              filter(dataset$Module==module) %>% dplyr::rename('MM' = paste0('MM.',gsub('#','',module))) %>% 
              mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>% top_n(20)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]], caption=paste0('Top 10 genes for module ', top_modules[1], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[1]),1],2),')'))
Top 10 genes for module #D69100 (MTcor = 1)
ID external_gene_id MM GS gene.score importance
ENSG00000137642 SORL1 0.8978882 0.9938396 None 0.9458639
ENSG00000124151 NCOA3 0.8837156 0.9999000 None 0.9418078
ENSG00000244405 ETV5 0.8778146 0.9734862 None 0.9256504
ENSG00000129460 NGDN 0.8547651 0.9888390 None 0.9218021
ENSG00000161326 DUSP14 0.9534829 0.8873380 None 0.9204105
ENSG00000139318 DUSP6 0.9153753 0.8778564 None 0.8966158
ENSG00000079156 OSBPL6 0.8357863 0.9504505 None 0.8931184
ENSG00000081923 ATP8B1 0.8369237 0.9460857 None 0.8915047
ENSG00000138698 RAP1GDS1 0.9144738 0.8574519 None 0.8859628
ENSG00000165097 KDM1B 0.8777870 0.8811877 None 0.8794873
ENSG00000100603 SNW1 0.8065422 0.9258086 None 0.8661754
ENSG00000172071 EIF2AK3 0.8424813 0.8897561 None 0.8661187
ENSG00000198576 ARC 0.8165030 0.9057289 None 0.8611159
ENSG00000068878 PSME4 0.8554311 0.8654616 None 0.8604464
ENSG00000125740 FOSB 0.8198621 0.8996153 None 0.8597387
ENSG00000050438 SLC4A8 0.8382723 0.8810656 None 0.8596690
ENSG00000146263 MMS22L 0.7148092 0.9999000 None 0.8573546
ENSG00000115128 SF3B14 0.8382295 0.8752307 None 0.8567301
ENSG00000120738 EGR1 0.8438816 0.8563020 None 0.8500918
ENSG00000067248 DHX29 0.8873499 0.8119900 None 0.8496700
kable(top_genes[[2]], caption=paste0('Top 10 genes for module ', top_modules[2], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[2]),1],2),')'))
Top 10 genes for module #D873FC (MTcor = -0.94)
ID external_gene_id MM GS gene.score importance
ENSG00000178386 ZNF223 0.8617625 -0.9953450 None 0.9285537
ENSG00000160345 C9orf116 0.9313142 -0.9104685 None 0.9208914
ENSG00000146707 POMZP3 0.8576356 -0.9101464 None 0.8838910
ENSG00000188312 CENPP 0.7649239 -0.9999000 None 0.8824120
ENSG00000114378 HYAL1 0.7830037 -0.9664075 None 0.8747056
ENSG00000137944 CCBL2 0.8838408 -0.8586103 None 0.8712256
ENSG00000105173 CCNE1 0.8244102 -0.9076695 None 0.8660398
ENSG00000101898 RP3-324O17.4 0.8834248 -0.8345949 None 0.8590099
ENSG00000196812 ZSCAN16 0.7470864 -0.9369451 None 0.8420157
ENSG00000101639 CEP192 0.7596794 -0.9180661 None 0.8388728
ENSG00000101213 PTK6 0.8061838 -0.8514060 None 0.8287949
ENSG00000159618 GPR114 0.7763124 -0.8201560 None 0.7982342
ENSG00000181929 PRKAG1 0.8034814 -0.7814905 None 0.7924859
ENSG00000137463 MGARP 0.8036502 -0.7694300 None 0.7865401
ENSG00000166347 CYB5A 0.7328774 -0.7867791 None 0.7598282
ENSG00000235878 AP001468.1 0.7475696 -0.6616010 None 0.7045853
ENSG00000185000 DGAT1 0.7176196 -0.6893801 None 0.7034999
ENSG00000163808 KIF15 0.7772614 -0.6017613 None 0.6895113
ENSG00000126247 CAPNS1 0.7051798 -0.6683166 None 0.6867482
ENSG00000239887 C1orf226 0.6326670 -0.7201532 None 0.6764101
rm(create_table)
pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), 'gray')) %>%
            mutate(alpha = ifelse(color %in% top_modules & 
                                  ID %in% ids, 1, 0.1))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')

Level of expression by Diagnosis for top genes, ordered by importance (defined above)

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(Dissected_Sample_ID, Diagnosis),
                        by = c('variable'='Dissected_Sample_ID')) %>% arrange(desc(importance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(external_gene_id, 
                                    levels=unique(plot_data$external_gene_id), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      xlab(paste0('Top genes for module ', top_modules[i], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[i]][1],2), ')')) + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  return(p)
  
}

create_plot(1)
create_plot(2)
rm(create_plot)




Enrichment Analysis


Using the package anRichment

# Prepare dataset

# Create dataset with top modules membership and removing the genes without an assigned module
EA_dataset = data.frame('ensembl_gene_id' = genes_info$ID,
                        module = ifelse(genes_info$Module %in% top_modules, genes_info$Module, 'other')) %>%
             filter(genes_info$Module!='gray')

# Assign Entrez Gene Id to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=EA_dataset$ensembl_gene_id, mart=mart)
## Cache found
EA_dataset = EA_dataset %>% left_join(biomart_output, by='ensembl_gene_id')

for(tm in top_modules){
  cat(paste0('\n',sum(EA_dataset$module==tm & is.na(EA_dataset$entrezgene)), ' genes from top module ',
             tm, ' don\'t have an Entrez Gene ID')) 
}
## 
## 13 genes from top module #D69100 don't have an Entrez Gene ID
## 0 genes from top module #D873FC don't have an Entrez Gene ID
rm(getinfo, mart, biomart_output, tm)
# Manual: https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/GeneAnnotation/Tutorials/anRichment-Tutorial1.pdf

collectGarbage()

# EA_dataset = rbind(EA_dataset[EA_dataset$module!='other',], EA_dataset[EA_dataset$module=='other',][sample(sum(EA_dataset$module=='other'), 1000),])

# Prepare datasets
GO_col = buildGOcollection(organism = 'human', verbose = 0)
## Loading required package: org.Hs.eg.db
## 
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
internal_col = internalCollection(organism = 'human')
MillerAIBS_col = MillerAIBSCollection(organism = 'human')
BrainDisease_col = BrainDiseaseCollection(organism = 'human')
combined_col = mergeCollections(GO_col, internal_col, MillerAIBS_col, BrainDisease_col)

# Print collections used
cat('Using collections: ')
## Using collections:
knownGroups(combined_col, sortBy = 'size')
##  [1] "GO"                                         
##  [2] "GO.BP"                                      
##  [3] "GO.MF"                                      
##  [4] "GO.CC"                                      
##  [5] "JA Miller at AIBS"                          
##  [6] "Chip-X enrichment analysis (ChEA)"          
##  [7] "Brain"                                      
##  [8] "JAM"                                        
##  [9] "Prenatal brain"                             
## [10] "Brain region markers"                       
## [11] "Cortex"                                     
## [12] "Brain region marker enriched gene sets"     
## [13] "WGCNA"                                      
## [14] "BrainRegionMarkers"                         
## [15] "BrainRegionMarkers.HBA"                     
## [16] "BrainRegionMarkers.HBA.localMarker(top200)" 
## [17] "Postnatal brain"                            
## [18] "ImmunePathways"                             
## [19] "Markers of cortex layers"                   
## [20] "BrainLists"                                 
## [21] "Cell type markers"                          
## [22] "Germinal brain"                             
## [23] "BrainRegionMarkers.HBA.globalMarker(top200)"
## [24] "Accelerated evolution"                      
## [25] "Postmitotic brain"                          
## [26] "BrainLists.Blalock_AD"                      
## [27] "BrainLists.DiseaseGenes"                    
## [28] "BloodAtlases"                               
## [29] "Verge Disease Genes"                        
## [30] "BloodAtlases.Whitney"                       
## [31] "BrainLists.JAXdiseaseGene"                  
## [32] "BrainLists.MO"                              
## [33] "Age-associated genes"                       
## [34] "BrainLists.Lu_Aging"                        
## [35] "Cell type marker enriched gene sets"        
## [36] "BrainLists.CA1vsCA3"                        
## [37] "BrainLists.MitochondrialType"               
## [38] "BrainLists.MO.2+_26Mar08"                   
## [39] "BrainLists.MO.Sugino"                       
## [40] "BloodAtlases.Gnatenko2"                     
## [41] "BloodAtlases.Kabanova"                      
## [42] "BrainLists.Voineagu"                        
## [43] "StemCellLists"                              
## [44] "StemCellLists.Lee"
# Perform Enrichment Analysis
enrichment = enrichmentAnalysis(classLabels = EA_dataset$module, identifiers = EA_dataset$entrezgene,
                                refCollection = combined_col, #useBackground = 'given', 
                                threshold = 1e-4, thresholdType = 'Bonferroni',
                                getOverlapEntrez = FALSE, getOverlapSymbols = TRUE)
##  enrichmentAnalysis: preparing data..
##   ..working on label set 1 ..


Results


kable(enrichment$enrichmentTable %>% filter(class==top_modules[1]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio,
                    effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[1], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[1]][1],4), ')'))
Enriched terms for module #D69100 (MTcor = 0.9999)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000349 Genes bound by KDM5B in MOUSE MESC from PMID 21448134 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0000000 0.0000000 1.519917 877 2956 250
JAMiller.AIBS.000095 Cortical PNOC neurons JA Miller at AIBS|Brain|Postnatal brain|Cell type markers|Cortex 0.0000008 0.0000004 1.396110 877 3939 306
JAMiller.AIBS.000277 Genes bound by ELK1 in HUMAN MCF10A from PMID 22589737 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0000079 0.0000026 2.043817 877 765 87
JAMiller.AIBS.000246 Genes bound by CREM in MOUSE GC1-SPG from PMID 20920259 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0000184 0.0000046 1.322843 877 4687 345
JAMiller.AIBS.000245 Genes bound by CREB1 in RAT HIPPOCAMPUS from PMID 23762244 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0001682 0.0000336 1.532112 877 2041 174
JAMiller.AIBS.000557 Genes bound by YY1 in MOUSE MYOBLASTS AND MYOTUBES from PMID 23942234 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0002229 0.0000371 1.729474 877 1195 115
JAMiller.AIBS.000094 Corticalstriatal neurons JA Miller at AIBS|Brain|Postnatal brain|Cell type markers|Cortex 0.0004419 0.0000631 1.310293 877 4389 320
JAMiller.AIBS.000257 Genes bound by DMRT1 in MOUSE TESTES from PMID 23473982 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0005357 0.0000670 1.583487 877 1657 146
JAMiller.AIBS.000244 Genes bound by CREB1 in MOUSE GC1-SPG from PMID 20920259 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0011240 0.0001249 1.445898 877 2461 198
JAMiller.AIBS.000374 Genes bound by MYC in mouse MESC from PMID 19030024 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0017096 0.0001710 1.397298 877 2881 224
kable(enrichment$enrichmentTable %>% filter(class==top_modules[2]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio,
                    effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[2], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[2]][1],4), ')'))
Enriched terms for module #D873FC (MTcor = -0.9424)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
GO:0047315 kynurenine-glyoxylate transaminase activity GO|GO.MF 1 0.6566050 750.52381 21 1 1
GO:0032190 acrosin binding GO|GO.MF 1 0.8350517 375.26190 21 2 1
GO:0035804 structural constituent of egg coat GO|GO.MF 1 0.8350517 375.26190 21 2 1
GO:0036117 hyaluranon cable GO|GO.CC 1 0.8350517 375.26190 21 2 1
GO:0047804 cysteine-S-conjugate beta-lyase activity GO|GO.MF 1 0.8350517 375.26190 21 2 1
GO:0050252 retinol O-fatty-acyltransferase activity GO|GO.MF 1 0.8350517 375.26190 21 2 1
GO:1900087 positive regulation of G1/S transition of mitotic cell cycle GO|GO.BP 1 0.5582722 41.69577 21 36 2
GO:0030204 chondroitin sulfate metabolic process GO|GO.BP 1 0.6126787 39.50125 21 38 2
GO:0050654 chondroitin sulfate proteoglycan metabolic process GO|GO.BP 1 0.6887348 34.90808 21 43 2
GO:1902808 positive regulation of cell cycle G1/S phase transition GO|GO.BP 1 0.6925801 33.35661 21 45 2

Save Enrichment Analysis results

save(enrichment, file='./../Data/enrichmentAnalysis.RData')
#load('./../Data/enrichmentAnalysis.RData')



Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.10.0                      
##  [2] BrainDiseaseCollection_1.00              
##  [3] anRichment_1.01-2                        
##  [4] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
##  [5] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2  
##  [6] GenomicFeatures_1.38.2                   
##  [7] GenomicRanges_1.38.0                     
##  [8] GenomeInfoDb_1.22.0                      
##  [9] anRichmentMethods_0.90-1                 
## [10] WGCNA_1.68                               
## [11] fastcluster_1.1.25                       
## [12] dynamicTreeCut_1.63-1                    
## [13] GO.db_3.10.0                             
## [14] AnnotationDbi_1.48.0                     
## [15] IRanges_2.20.2                           
## [16] S4Vectors_0.24.3                         
## [17] Biobase_2.46.0                           
## [18] BiocGenerics_0.32.0                      
## [19] biomaRt_2.42.0                           
## [20] knitr_1.24                               
## [21] doParallel_1.0.15                        
## [22] iterators_1.0.12                         
## [23] foreach_1.4.7                            
## [24] polycor_0.7-10                           
## [25] expss_0.10.1                             
## [26] GGally_1.4.0                             
## [27] gridExtra_2.3                            
## [28] viridis_0.5.1                            
## [29] viridisLite_0.3.0                        
## [30] RColorBrewer_1.1-2                       
## [31] dendextend_1.13.3                        
## [32] plotly_4.9.2                             
## [33] glue_1.3.1                               
## [34] reshape2_1.4.3                           
## [35] forcats_0.4.0                            
## [36] stringr_1.4.0                            
## [37] dplyr_0.8.3                              
## [38] purrr_0.3.3                              
## [39] readr_1.3.1                              
## [40] tidyr_1.0.2                              
## [41] tibble_2.1.3                             
## [42] ggplot2_3.2.1                            
## [43] tidyverse_1.3.0                          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.2-0                 BiocFileCache_1.10.2       
##   [5] plyr_1.8.5                  lazyeval_0.2.2             
##   [7] splines_3.6.0               crosstalk_1.0.0            
##   [9] BiocParallel_1.20.1         robust_0.4-18.2            
##  [11] digest_0.6.24               htmltools_0.4.0            
##  [13] fansi_0.4.1                 magrittr_1.5               
##  [15] checkmate_1.9.4             memoise_1.1.0              
##  [17] fit.models_0.5-14           cluster_2.0.8              
##  [19] annotate_1.64.0             Biostrings_2.54.0          
##  [21] modelr_0.1.5                matrixStats_0.55.0         
##  [23] askpass_1.1                 prettyunits_1.0.2          
##  [25] colorspace_1.4-1            blob_1.2.1                 
##  [27] rvest_0.3.5                 rappdirs_0.3.1             
##  [29] rrcov_1.4-7                 haven_2.2.0                
##  [31] xfun_0.8                    crayon_1.3.4               
##  [33] RCurl_1.95-4.12             jsonlite_1.6               
##  [35] genefilter_1.68.0           impute_1.60.0              
##  [37] survival_2.44-1.1           gtable_0.3.0               
##  [39] zlibbioc_1.32.0             XVector_0.26.0             
##  [41] DelayedArray_0.12.2         DEoptimR_1.0-8             
##  [43] scales_1.1.0                mvtnorm_1.0-11             
##  [45] DBI_1.1.0                   Rcpp_1.0.3                 
##  [47] xtable_1.8-4                progress_1.2.2             
##  [49] htmlTable_1.13.1            foreign_0.8-71             
##  [51] bit_1.1-15.2                preprocessCore_1.48.0      
##  [53] Formula_1.2-3               htmlwidgets_1.5.1          
##  [55] httr_1.4.1                  ellipsis_0.3.0             
##  [57] acepack_1.4.1               farver_2.0.3               
##  [59] pkgconfig_2.0.3             reshape_0.8.8              
##  [61] XML_3.99-0.3                nnet_7.3-12                
##  [63] dbplyr_1.4.2                locfit_1.5-9.1             
##  [65] later_1.0.0                 labeling_0.3               
##  [67] tidyselect_0.2.5            rlang_0.4.4                
##  [69] munsell_0.5.0               cellranger_1.1.0           
##  [71] tools_3.6.0                 cli_2.0.1                  
##  [73] generics_0.0.2              RSQLite_2.2.0              
##  [75] broom_0.5.4                 fastmap_1.0.1              
##  [77] evaluate_0.14               yaml_2.2.0                 
##  [79] bit64_0.9-7                 fs_1.3.1                   
##  [81] robustbase_0.93-5           nlme_3.1-139               
##  [83] mime_0.9                    xml2_1.2.2                 
##  [85] compiler_3.6.0              rstudioapi_0.10            
##  [87] curl_4.3                    reprex_0.3.0               
##  [89] geneplotter_1.64.0          pcaPP_1.9-73               
##  [91] stringi_1.4.6               highr_0.8                  
##  [93] lattice_0.20-38             Matrix_1.2-17              
##  [95] vctrs_0.2.2                 pillar_1.4.3               
##  [97] lifecycle_0.1.0             data.table_1.12.8          
##  [99] bitops_1.0-6                httpuv_1.5.2               
## [101] rtracklayer_1.46.0          R6_2.4.1                   
## [103] latticeExtra_0.6-28         promises_1.1.0             
## [105] codetools_0.2-16            MASS_7.3-51.4              
## [107] assertthat_0.2.1            SummarizedExperiment_1.16.1
## [109] DESeq2_1.26.0               openssl_1.4.1              
## [111] withr_2.1.2                 GenomicAlignments_1.22.1   
## [113] Rsamtools_2.2.2             GenomeInfoDbData_1.2.2     
## [115] hms_0.5.3                   grid_3.6.0                 
## [117] rpart_4.1-15                rmarkdown_1.14             
## [119] Cairo_1.5-10                shiny_1.4.0                
## [121] lubridate_1.7.4             base64enc_0.1-3